# Simulation Main

## Simulation for expenditure Benford's Law Tests
# On data with scattered rounding

N <- 10000
D = data.frame(matrix(ncol = 0, nrow = N))
D$District <- sample(c("A", "B", "C", "D", "E"), N, replace=T)
D$Categories <- sample(c("W", "X", "Y", "Z"), N, replace=T)
D$Year <- sample(c("2000", "2001", "2002", "2003", "2004"), N, replace=T)


## Create a lot of Benford conforming data
mantissa = exp(runif(N)*log(10))
power = sample(seq(3,7),N, replace = T)
D$Expense = mantissa * 10^power 
D$Expense = floor(D$Expense)

## Conduct assorted rounding
## On 10% of the sample
D$ExpenseRound = runif(N) < 0.1 
for(i in 1:nrow(D)){
	if(D$ExpenseRound[i]){
		D$Expense[i] <- signif(D$Expense[i], sample(1:5,1))
	}
}


#Run our tests against it
#library(devtools)
#install_github("jlederluis/digitanalysis", force = TRUE)

library(digitanalysis)
library(ggplot2)

Data <- process_digit_data(raw_df = D, digit_columns = c('Expense'))


#######################################
# Plot toggle: turn on before plotting
plot_toggle = TRUE 
# Scale plots to match ELL 

# First digit test
first_digit = all_digits_test(digitdata = Data, data_columns = 'Expense', digit_places = 1, omit_05 = 0, break_out='District',
                              suppress_first_division_plots=TRUE, plot=plot_toggle)

pdf("FirstDigits.pdf")
first_digit$plots$AllBreakout$AllCategory$aggregate_barplot +  scale_y_continuous(labels = scales::percent) + coord_cartesian(ylim =c(0, .3))
dev.off()

#All digits test except first with expenditure
all_digits = all_digits_test(digitdata = Data, data_columns = 'Expense', skip_first_digit = TRUE, omit_05 = c(0,5), break_out='District',
                            suppress_first_division_plots=TRUE, plot=plot_toggle)
pdf("AllDigits.pdf")
all_digits$plots$AllBreakout$AllCategory$aggregate_barplot + ggtitle("") + theme(legend.position = "none") + xlab("Digits from All Digit Places Beyond First \n (Excluding 0s and 5s)") +  scale_y_continuous(labels = scales::percent) + coord_cartesian(ylim =c(0, .15))
dev.off()

# Padding test

padding = padding_test(digitdata = Data, data_columns = 'Expense', break_out = "District", max_length=7, num_digits=5, N=1000, omit_05=c(0,5),
                       simulate=T, suppress_first_division_plots=TRUE, plot=plot_toggle)
pdf("Padding.pdf")
padding$plots$All+ ggtitle("") + coord_cartesian(ylim =c(-0.15, 0.75))
dev.off()

# Digit pairs
digit_pair = digit_pairs_test(digitdata = Data, data_columns = 'Expense', omit_05 = 0, break_out='District', plot=plot_toggle)
pdf("DigitPair.pdf") 
digit_pair$plot + geom_hline(aes(yintercept=9/100, linetype="Uniform Distribution"), color='red', lwd=1) +  ggtitle("") + xlab("District") + scale_y_continuous(labels = scales::percent) + coord_cartesian(ylim =c(0, .18))
dev.off()


# High low 
high_low = high_low_test(digitdata = Data, data_columns = 'Expense', omit_05 = c(0,5), skip_first_digit=TRUE, break_out=NA, category='Year', plot=plot_toggle, remove_all_category_visualize = T)
high_low$plot

pdf("HighLow.pdf")
high_low$plot +ggtitle("") +xlab("") +theme(axis.text.x = element_blank())+ ylab("% High Digits")+ coord_cartesian(ylim =c(.3, .55)) + scale_fill_manual(values=c("#b8b8b8", "#919191", "#7a7a7a", "#616161", "#525252")) + scale_y_continuous(breaks = seq(.3, .55, by = .05), labels = scales::percent) 
dev.off()



########################################################################
#Repeat test with expenditure
repeats = repeat_test(digitdata = Data, duplicate_matching_cols=c("Expense", "Year", "District", "Categories"),
                      break_out='District', data_column = 'Expense', plot=plot_toggle)

pdf("Repeats.pdf")
repeats$plot +ggtitle("") + xlab("District")+ scale_y_continuous(labels = scales::percent)
dev.off()




########################################################################
#Rounding test with expenditure
rounding = rounding_test(digitdata = Data, data_columns = 'Expense', break_out='District',
                         rounding_patterns = c('0','00','000','0000', '00000', '000000', '5', '50', '500'), plot=plot_toggle)

pdf("Rounding.pdf")
rounding$plot +ggtitle("")+ xlab("District") + scale_y_continuous(labels = scales::percent)
dev.off()



########################################################################
#Repeat test by Sector
sector = sector_test(digitdata = Data, category='Categories', duplicate_matching_cols=c("Expense", "Year", "District", "Categories"),
                     break_out='District', data_column = 'Expense',
                     category_instance_analyzing = 'W', plot=plot_toggle, remove_all_category_visualize = T)

pdf("Sector.pdf")
sector$plot +ggtitle("")+scale_y_continuous(labels = scales::percent)
dev.off()

### Make figs





# P-Value printout to produce final table

sink(file = "P-Values.txt")

print("First Digits")
first_digit$p_values


print("AllDigits")
all_digits$p_values


print("Digit Pairs")
digit_pair$p_values

print("Sector Repeats")
sector$p_values

print("Repeats")
repeats$p_values

print("Rounding")
rounding$p_values


print("High-Low by Year")
high_low$p_values
  
sink()


